Going back to step 01, let's review our goals.
Objectives This project focuses on quantitative data science and mainly aims to perform two goals. One is to predict housing price based on multiple variables such as room size or building types etc. The other is to forecast sales of a real estate company.
These goals is the time series forecast part and is comlpeted in the differet file. Here, we focus on teh former goal, which is to make a financial model for a housnig price.
Using this model, we analyze some housing price transaction in 2018. Again, we are assuming we were in Jan, 2018.
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from library.sb_utils import save_file
# For normalization
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# For metrics
from sklearn.metrics import mean_squared_error,mean_absolute_error,explained_variance_score
%matplotlib inline
%pdb 0
# Set the number of display
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', -1) # or 199
pd.options.display.float_format = '{:.4f}'.format
def load_CVmodel(model_path,expected_model_version = '1.0'):
import pickle
from sklearn import __version__ as sklearn_version
if os.path.exists(model_path):
with open(model_path, 'rb') as f:
model = pickle.load(f)
print("Expected model version has been loaded")
if model.version != expected_model_version:
print("Expected model version doesn't match version loaded")
if model.sklearn_version != sklearn_version:
print("Warning: model created under different sklearn version")
return model
else:
print("Expected model not found. Run the model fit")
# https://github.com/jbmouret/matplotlib_for_papers/blob/master/src/plot_variance_matplotlib_white.py
def clean_layout(ax):
# Delete unnecessary lines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
try:
# Remove frame color
frame = ax.legend_.get_frame()
frame.set_facecolor('1.0')
frame.set_edgecolor('1.0')
except:
pass
# y axis grid line
ax.grid(axis='y', color="0.9", linestyle='-', linewidth=1)
ax.set_axisbelow(True)
df0 = pd.read_csv('../data/df_data_step3b_EDA.csv')
df_sales = pd.read_csv('../data/time_sales_data_step3b_EDA.csv',index_col=None)
ModelComparison = pd.read_csv('../data/ModelComparison_data_step4_EDA.csv',index_col=None)
# This is necessary for making pipeline
dummy_step04 = pd.read_csv('../data/dummies_in_step4_EDA.csv',index_col=None)
df0.head()
# check the best stack model and the models that used in the stack model
ModelComparison.iloc[[7,9,10]]
The model we use is Stacking ensemble, which is based on the other two models.
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
As we are assuming we were in Jan, 2016, the data is set to only 2017.
# Drop the columns unnecessary
class DropColumns(BaseEstimator):
def __init__(self):
pass
def fit(self, X, y=None):
return self
def transform(self, X: pd.DataFrame, y=None):
X = X.drop(['url','id','Cid','tradeTime', 'totalPrice'],axis=1) # The model was only trained on Metropolitan
return X
The process is the same as step04
dummy_step04.columns
class DummyColumns(BaseEstimator):
def _init__(self):
pass
def fit(self,X, y=None):
return self
def transform(self, X, y=None):
# Set feature values to make dummies for
ls_label = ['buildingType','elevator','renovationCondition' , 'buildingStructure', 'fiveYearsProperty', 'subway', 'district', 'floorType', 'DOM_cat']
dummies_columns = dummy_step04.columns
dummies = pd.get_dummies(X[ls_label].astype(str),
# drop_first=True
)
dummies = dummies.reindex(columns = dummies_columns, fill_value=0)
res = pd.concat([X, dummies], axis=1)
res = res.drop(ls_label, axis=1)
return res
model_path = '../models/stack_opt.pkl'
stack = load_CVmodel(model_path)
from sklearn.pipeline import make_pipeline,Pipeline
# structure_pipeline = Pipeline([
# ("DropColumns", DropColumns()),
# ("DummyColumns", DummyColumns()),
# ])
financialModel_1_pipe = Pipeline([
("DropColumns", DropColumns()),
("DummyColumns", DummyColumns()),
("stack",stack),
])
def make_prediction(df,model_pipeline, model_name:str):
# Create train and test data set
# This is for modeling, which is the data in 2018
df_train = df.drop(['price'],axis=1)
df_test = df[['price']]
# data = structure_pipeline.transform(df_train) # Save the transformation to a separate df
predictions = model_pipeline.predict(df_train).reshape(-1,1) # Make the prediction on the original df
# Append the prediction to the separate transformed data
# data[f'preidcitons_price'] = y_scaler.inverse_transform(predictions)
data = df.copy()
data['preidcitons_price'] = predictions
data.rename(columns={'preidcitons_price':f'modelPrice_{model_name}'}, inplace=True)
# data['price'] = df_test
return data
price_prediction_stack = make_prediction(df0[df0.year==2018],financialModel_1_pipe, 'stack')
price_prediction_stack.tail()
# Difference between price and model price
def calModelGap(df):
df['gap'] = df['price']-df['modelPrice_stack']
calModelGap(price_prediction_stack)
sns.histplot(price_prediction_stack['gap'],kde=True)
sns.despine(left=True);
This tells price in actual transaction is more inexpensive than listed price.
Let's check the the worst 5.
price_prediction_stack.sort_values(by='gap',ascending=True,)[['price','modelPrice_stack','gap']].head()
These houses will be found by making use of standard deviation.
sigma = price_prediction_stack['gap'].std()
sigma
price_prediction_stack['priceTooLow'] = price_prediction_stack['gap'].apply(lambda x: 1 if x<-2*sigma else 0)
tooLow = price_prediction_stack[price_prediction_stack.priceTooLow==1]
tooLow.head()
tooLow_new = tooLow.iloc[:,:31].copy()
tooLow_new.renovationCondition = int(4)
tooLow_new.head()
tooLow_new_reno4 = make_prediction(tooLow_new,financialModel_1_pipe, 'stack')
tooLow_new_reno4[['renovationCondition','modelPrice_stack']]
len(tooLow_new_reno4[['renovationCondition','modelPrice_stack']])
tooLow['reno_4'] = tooLow_new_reno4['modelPrice_stack']
table1 = tooLow[['renovationCondition','modelPrice_stack','reno_4']]
# Expected increase in total sales
tooLow_new_reno4['modelPrice_stack'].sum() - tooLow_new_reno4['price'].sum()
# calculate the gap between before-price and after-price
table1['gap'] = table1['reno_4']-table1['modelPrice_stack']
# filter out the houses which prices can be improved by hard floor renovation
temp = table1[table1['gap'] > table1['gap'].std()]
# rename the table
tempp = temp.rename(columns={
'renovationCondition':'Condition before renovation',
'modelPrice_stack':'price before',
'reno_4':'price after',
})
tempp['percent increase'] = tempp['gap']/tempp['price before']*100
#
tempp
#
tempp.describe()
sns.boxplot(x="renovationCondition", y="gap", data=price_prediction_stack)
sns.stripplot(x="renovationCondition", y="gap", data=price_prediction_stack, marker="o", alpha=0.3, color="black", order=[1,2,3,4]);
sns.despine(left=True);
Definition of reonvation condition category
sns.boxplot(x="renovationCondition", y="modelPrice_stack", data=price_prediction_stack,)
sns.stripplot(x="renovationCondition", y="modelPrice_stack", data=price_prediction_stack, marker="o", alpha=0.3, color="black", order=[1,2,3,4]);
sns.despine(left=True);
According to this, there seem to be no difference between Hardcover renovationa (cat4) and Simplicity renovation (cat3). If this is true, there is no merit to spend extra money for renovation.
Next, we statistically test this.
# Create a column called `Permutation1`, and assign to it the result of permuting (shuffling) the price column
price_prediction_stack['Permutation1'] = np.random.permutation(price_prediction_stack.price)
# Call the describe() method on our permutation grouped by 'platform'.
price_prediction_stack.groupby('renovationCondition')[['Permutation1']].describe()
price_prediction_stack.groupby('renovationCondition')[['modelPrice_stack','Permutation1']].describe().T
price_prediction_stack.groupby('renovationCondition')[['Permutation1']].mean()
renoCat1 = price_prediction_stack[price_prediction_stack.renovationCondition==1].modelPrice_stack
renoCat2 = price_prediction_stack[price_prediction_stack.renovationCondition==2].modelPrice_stack
renoCat3 = price_prediction_stack[price_prediction_stack.renovationCondition==3].modelPrice_stack
renoCat4 = price_prediction_stack[price_prediction_stack.renovationCondition==4].modelPrice_stack
from scipy import stats
renoCat3_normal = stats.normaltest(renoCat3)
renoCat3_normal
renoCat4_normal = stats.normaltest(renoCat4)
renoCat4_normal
Both are not normal.
Our Null hypothesis is just:
H null: the observed difference in the mean Hardcover renovationa (cat4) and mean Simplicity renovation (cat3) is due to chance (and thus not due to the renovation).
H alternative: the observed difference is not due to chance (and is actually due to renovation)
We're also going to pick a significance level of 0.05.
Is there wny difference among categories? Here, we perform permutaiton test. The data to use is modelPrice_stack predicted by the financial model.
sns.histplot(renoCat3,kde=True,color='r')
sns.histplot(renoCat4,kde=True,color='b')
sns.despine(left=True);
# Another way per the Datacamp in sec.11.3
np.random.seed(42)
def permutation_sample(data1, data2):
"""Generate a permutation sample from two data sets."""
# Concatenate the data sets: data
data = np.concatenate((data1, data2))
# Permute the concatenated array: permuted_data
permuted_data = np.random.permutation(data)
# Split the permuted array into two: perm_sample_1, perm_sample_2
perm_sample_1 = permuted_data[:len(data1)]
perm_sample_2 = permuted_data[len(data1):]
return perm_sample_1, perm_sample_2
def diff_of_means(data_1, data_2):
"""Difference in means of two arrays."""
# The difference of means of data_1, data_2: diff
diff = np.mean(data_1)-np.mean(data_2)
return diff
def draw_perm_reps(data_1, data_2, func, size=1):
"""Generate multiple permutation replicates."""
# Initialize array of replicates: perm_replicates
perm_replicates = np.empty(size)
for i in range(size):
# Generate permutation sample
perm_sample_1, perm_sample_2 = permutation_sample(data_1, data_2)
# Compute the test statistic
perm_replicates[i] = func(perm_sample_1, perm_sample_2)
return perm_replicates
def permutationTest(a,b):
# Compute difference of mean impact force from experiment: empirical_diff_means
empirical_diff_means = diff_of_means(a,b)
# Draw 10,000 permutation replicates: perm_replicates
perm_replicates = draw_perm_reps(a,b,diff_of_means, size=10000)
# Compute p-value: p
# two-tailed
p = np.sum(perm_replicates >= empirical_diff_means)/ len(perm_replicates)
# Print the result
print('empirical_diff_means ',empirical_diff_means)
print('p-value =', p)
plt.hist(perm_replicates)
plt.axvline(empirical_diff_means, color='r')
permutationTest(renoCat3,renoCat4)
sns.despine(left=True);
There is no difference between 3 and 4 renovationCondition. Based on this result, we cannot deny the null hypoothesis that the distributions are same. The benefit of renovation is not significant in terms of price appreciation.
import random
from scipy.stats import ttest_ind
ttest,pval = ttest_ind(renoCat3,renoCat4, equal_var = False)
print("ttest",ttest)
print('p value',format(pval, '.70f'))
if pval <0.05:
print("we reject null hypothesis")
else:
print("we accept null hypothesis")
One of the methods to increase a housing price is renovation. How much appreciation in price will be expected if Hardcover renovation (cat4) are applied to all houses? Typical renovation is done within one month, and only concern here is to check if the improvement in price exceeds by 15% or not.
# Make a dataframe for this question
dfQ2 = df0[df0.year==2018]
# Change therenovationCondition to 4
dfQ2.renovationCondition = 4
dfQ2.head()
# Predict new prices
Q2 = make_prediction(dfQ2,financialModel_1_pipe, 'stack')
Q2.rename(columns={'modelPrice_stack':'price_renovation3to4'}, inplace=True)
price_prediction_stack = pd.concat([price_prediction_stack,Q2.iloc[:,-1:]],axis=1)
price_prediction_stack['renovTo4_pct'] =\
(price_prediction_stack.price_renovation3to4-price_prediction_stack.modelPrice_stack)/price_prediction_stack.modelPrice_stack*100
price_prediction_stack.head()
f =(price_prediction_stack.renovationCondition==1)|(price_prediction_stack.renovationCondition==2)|(price_prediction_stack.renovationCondition==3)
price_prediction_stack[f][['renovTo4_pct']].describe().T
The expected appreciation is as following.
Based on these, standard deviation is same level as its mean. According to this model, the average improvement in price by renovation is 2.0%. Therefore, if renovation cost is less than sales price times this 2.0%, the renovation may be worth an invest, but it si difficut to say that because the standard deviation is as high as 2.0.
Let's find the houses that should be renovated with teo sigma confidence.
sigma = price_prediction_stack[f].renovTo4_pct.std()
sigma
# filter out the only price_renovation3to4 that have significant level of positive percentage increase (price_renovation3to4) by more than two sigma.
price_prediction_stack[price_prediction_stack.renovTo4_pct>sigma*2]
Those houses are financial-modelically worthwhile investment of renovation.
# Add a new column of boolean if renovTo4_pct is positive or not.
def posiNega(x):
if x > 0:
return 1
else:
return 0
price_prediction_stack['renoPlus'] = price_prediction_stack['renovTo4_pct'].apply(posiNega)
price_prediction_stack.head()
features = ['followers',
'totalPrice', 'square', 'livingRoom', 'drawingRoom', 'kitchen',
'bathRoom', 'buildingType', 'constructionTime',
'buildingStructure', 'ladderRatio', 'elevator', 'fiveYearsProperty',
'subway', 'district', 'year', 'month', 'floorType', 'floorHeight',
'DOM_cat', '10yrBond_up%', 'hmi_up%', 'sse_up%', 'modelPrice_stack',
'renoPlus']
# sns.pairplot(price_prediction_stack[features], hue="renoPlus");
Using pairplot, it is found that the columns, 'livingRoom', 'buildingType', 'followers', had more difference between renoPlus boolean than the others.
Next, check the histgrams of these columns to find which houses are better to renovate.
f, ax = plt.subplots(1, 3, figsize=(12, 6), sharex=False)
for i,_ in enumerate(['livingRoom', 'buildingType', 'followers']):
sns.histplot(data=price_prediction_stack, x=_, multiple="dodge", hue="renoPlus",kde=True,ax=ax[i])
sns.despine(left=True)
a = price_prediction_stack[price_prediction_stack.renoPlus==0].livingRoom
b = price_prediction_stack[price_prediction_stack.renoPlus==1].livingRoom
permutationTest(a,b)
sns.despine(left=True);
a = price_prediction_stack[price_prediction_stack.renoPlus==0].buildingType
b = price_prediction_stack[price_prediction_stack.renoPlus==1].buildingType
permutationTest(a,b)
sns.despine(left=True);
a = price_prediction_stack[price_prediction_stack.renoPlus==0].followers
b = price_prediction_stack[price_prediction_stack.renoPlus==1].followers
permutationTest(a,b)
sns.despine(left=True);
We conclude that renovation will bring significant difference in followers.
price_prediction_stack[['gap']].describe().T
sig = np.std(price_prediction_stack['gap'])
# price - modelPrice_stack is too low(-)
len(price_prediction_stack[price_prediction_stack.gap<-2*sig].sort_values(by='gap'))
# price - modelPrice_stack is too high(+)
# houses which listed prices are much lower thatn predicted model
price_prediction_stack[price_prediction_stack.gap>1.9*sig].sort_values(by='gap',ascending = False).head(10)
Those may be worthwhile to review and revise those prices.
There are not so many feature values that real estate company can change regarding their product. One of them is renovation, and we have checked how the renovation will contribute to the increase in house price. Our conclusion is that it will unfortunately not bring signicant diffference in the appreciation in housing price, it can be said that it may be brive to cut off renovatio cost as reasonably as possible.
In addition to the renovation factor, we also filter out the houses which prices had better to be reconsidered.